if(!require("pacman")) install.packages("pacman")
## Loading required package: pacman
pacman::p_load(tidyverse,tidytext,glue,stringr,tm,zoo)
First, we load in ‘Hamlet’ and do some cleaning of the text.
No need to remove any words.
# Read the whole play as a single string, lower-case it, and split it into
# one word per row.
# read_file() already returns a character string; passing it through glue()
# would additionally evaluate anything between `{` and `}` in the play text
# as R code, so the text is used as-is.
text <- read_file("Hamlet.txt")
tokens <- tibble(text = tolower(text)) %>%
  unnest_tokens(word, text)
tokens
## # A tibble: 32,197 x 1
## word
## <chr>
## 1 act
## 2 i
## 3 scene
## 4 i
## 5 elsinore
## 6 a
## 7 platform
## 8 before
## 9 the
## 10 castle
## # ... with 32,187 more rows
We then do the following with our tokens:
# Tally the Bing polarity of the play:
#   1. keep only the tokens that appear in the Bing lexicon,
#   2. count how many carry each sentiment label,
#   3. put the labels side by side so the net score ("+/-") can be computed,
#   4. go back to long (key, value) form for plotting.
bing_matches <- inner_join(tokens, get_sentiments("bing"))
label_counts <- count(bing_matches, sentiment)
wide_counts <- spread(label_counts, sentiment, n, fill = 0)
wide_counts <- mutate(wide_counts, "+/-" = positive - negative)
sentiments <- gather(wide_counts)
sentiments
## # A tibble: 3 x 2
## key value
## <chr> <dbl>
## 1 negative 1290
## 2 positive 1233
## 3 +/- - 57.0
Now we can plot the results!
# Bar chart of the negative, positive and net ("+/-") counts.
# geom_col() draws bar heights straight from the `value` column.
sentimentsBars <- ggplot(data = sentiments, mapping = aes(x = key, y = value)) +
  geom_col(fill = "darkred")
sentimentsBars
But what’s in ‘bing’ and the other available sentiment lexica?
# Fix the RNG state so the sampled lexicon rows below are reproducible.
set.seed(7693)
# bing: show ten randomly chosen entries
bing_lexicon <- get_sentiments("bing")
bing_lexicon[sample(nrow(bing_lexicon), 10), ]
## # A tibble: 10 x 2
## word sentiment
## <chr> <chr>
## 1 mendacity negative
## 2 faultless positive
## 3 pitiless negative
## 4 fast-paced positive
## 5 superiority positive
## 6 resplendent positive
## 7 blasphemy negative
## 8 insensitively negative
## 9 uproot negative
## 10 reproach negative
# nrc: show ten randomly chosen entries
nrc_lexicon <- get_sentiments("nrc")
nrc_lexicon[sample(nrow(nrc_lexicon), 10), ]
## # A tibble: 10 x 2
## word sentiment
## <chr> <chr>
## 1 assessment trust
## 2 moribund sadness
## 3 villainous disgust
## 4 thirteenth fear
## 5 loom anticipation
## 6 shot surprise
## 7 influential trust
## 8 fancy positive
## 9 engaged positive
## 10 regression negative
# afinn: show ten randomly chosen entries
afinn_lexicon <- get_sentiments("afinn")
afinn_lexicon[sample(nrow(afinn_lexicon), 10), ]
## # A tibble: 10 x 2
## word score
## <chr> <int>
## 1 degrade -2
## 2 bereaved -2
## 3 deceit -3
## 4 prblm -2
## 5 envy -1
## 6 wrong -2
## 7 dont like -2
## 8 advantages 2
## 9 suicidal -2
## 10 disparaged -2
# loughran: show ten randomly chosen entries
loughran_lexicon <- get_sentiments("loughran")
loughran_lexicon[sample(nrow(loughran_lexicon), 10), ]
## # A tibble: 10 x 2
## word sentiment
## <chr> <chr>
## 1 collapse negative
## 2 chattels litigious
## 3 mistakes negative
## 4 exaggerating negative
## 5 enabling positive
## 6 antitrust negative
## 7 jurisdictional litigious
## 8 abrogate litigious
## 9 delinquency negative
## 10 impair negative
# List the distinct sentiment categories used by the loughran lexicon.
get_sentiments("loughran") %>%
  pull(sentiment) %>%
  unique()
## [1] "negative" "positive" "uncertainty" "litigious"
## [5] "constraining" "superfluous"
# Score every play in the working directory against the Bing lexicon and plot
# the negative / positive / net ("+/-") counts, one facet per play.

# The pattern is a regular expression: escape the dot and anchor it at the end
# so that only files whose names end in ".txt" are picked up.
playList <- list.files(pattern = "\\.txt$")

# Return the long-format (key, value, Play) Bing sentiment counts for one play.
# `play` is the path of a plain-text file; it is stored unchanged in `Play`.
score_play <- function(play) {
  # read_file() already returns a string; glue() would evaluate any `{...}`
  # found in the play text as R code.
  tibble(text = tolower(read_file(play))) %>%
    unnest_tokens(word, text) %>%
    inner_join(get_sentiments("bing"), by = "word") %>%
    count(sentiment) %>%
    spread(sentiment, n, fill = 0) %>%
    mutate("+/-" = positive - negative) %>%
    gather() %>%
    mutate(Play = play)
}

# Build one table per play and bind them once, instead of growing `df` with
# rbind() on every iteration.
df <- bind_rows(map(playList, score_play))

sentimentsBarsAll <- ggplot(df, aes(x = key, y = value)) +
  geom_bar(stat = "identity", fill = "darkred") +
  # Strip only a trailing ".txt" to get the facet label.
  facet_wrap(~ sub("\\.txt$", "", Play))
sentimentsBarsAll
We first add row numbers and then create a variable called polarity with the values -1 and 1, corresponding to “negative” and “positive”.
We then use this variable to create another variable containing the “rolling mean” of 50 sentiment words.
# For each play, draw every Bing-matched word as a bar at its token position
# (-1 for "negative", 1 for "positive") and overlay the rolling mean of the
# polarity taken over a window of 50 sentiment words.
for (play in playList) {
  # read_file() already returns a string; glue() would evaluate any `{...}`
  # found in the play text as R code.
  tokens <- tibble(text = tolower(read_file(play))) %>%
    unnest_tokens(word, text) %>%
    rowid_to_column("ID") # token position within the play

  sentiments <- tokens %>%
    inner_join(get_sentiments("bing"), by = "word") %>%
    mutate(
      # Build the column in one step rather than subset-assigning into a
      # column that does not exist yet; any other label would become NA.
      polarity = case_when(
        sentiment == "negative" ~ -1,
        sentiment == "positive" ~ 1
      ),
      # fill = list(NA, NULL, NA) pads both ends with NA so the result has
      # one value per row.
      rollMean = rollmean(polarity, 50, fill = list(NA, NULL, NA))
    )

  p <- ggplot(sentiments) +
    aes(ID, polarity, fill = sentiment) +
    geom_col() +
    geom_line(aes(ID, rollMean)) +
    # Strip only a trailing ".txt" to get the title.
    ggtitle(sub("\\.txt$", "", play)) +
    theme_minimal()
  print(p)
}
Each bar represents a negative or positive word in chronological order.
The lines represent the rolling means.
# For each play, average the polarity (-1 / 1) of the Bing-matched words in
# consecutive bins of `binSize` sentiment words and plot one bar per bin,
# coloured from red (negative) through white to blue (positive).
binSize <- 30L
for (play in playList) {
  # read_file() already returns a string; glue() would evaluate any `{...}`
  # found in the play text as R code.
  tokens <- tibble(text = tolower(read_file(play))) %>%
    unnest_tokens(word, text) %>%
    rowid_to_column("ID")

  sentiments <- tokens %>%
    inner_join(get_sentiments("bing"), by = "word") %>%
    mutate(
      polarity = case_when(
        sentiment == "negative" ~ -1,
        sentiment == "positive" ~ 1
      )
    )

  # Assign each sentiment word to a bin by its position and average per bin.
  # Reshaping with matrix(polarity, nrow = 30) recycles values from the start
  # of the play into the last column whenever the number of sentiment words
  # is not a multiple of 30; grouping lets the last bin simply be shorter.
  binned <- sentiments %>%
    mutate(row = (row_number() - 1L) %/% binSize + 1L) %>%
    group_by(row) %>%
    summarise(means = mean(polarity)) %>%
    ungroup()

  p <- ggplot(binned) +
    ggtitle(sub("\\.txt$", "", play)) + # strip only a trailing ".txt"
    theme_dark() +
    aes(row, means, fill = means) +
    geom_col() +
    ylim(-1, 1) +
    scale_fill_gradient2(
      low = "red", mid = "white", high = "blue",
      midpoint = 0, space = "Lab",
      na.value = "grey50", guide = "colourbar"
    )
  print(p)
}
library(wordcloud2)
# Pool the tokens of every play, count how often each Bing-lexicon word
# occurs, and render the counts as a word cloud.

# Tokenise each play separately and bind the results once, instead of growing
# `df` with rbind() on every iteration.
df <- bind_rows(map(playList, function(play) {
  # read_file() already returns a string; glue() would evaluate any `{...}`
  # found in the play text as R code.
  tibble(text = tolower(read_file(play))) %>%
    unnest_tokens(word, text)
}))

# count() groups by `word` itself, so no separate group_by() is needed and
# the result is not left grouped.
sentiments <- df %>%
  count(word) %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  arrange(desc(n))

set.seed(1112)
wordcloud2(
  sentiments,
  figPath = "silh2.png",
  size = 1.5,
  color = "snow",
  backgroundColor = "black"
)
knitr::include_graphics("sentimentCLoud.png")